1 Setup

2 Environment

Set the working dir

setwd("/mnt/picea/projects/arabidopsis/mschmid/porcupine-RNA-Seq")

Libs

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(Glimma))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(org.At.tair.db))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(VennDiagram))

Helper files

suppressMessages(source("~/Git/UPSCb/src/R/densityPlot.R"))
suppressMessages(source("~/Git/UPSCb/src/R/plotMA.R"))
suppressMessages(source("~/Git/UPSCb/src/R/volcanoPlot.R"))
suppressMessages(source("~/Git/UPSCb/src/R/plot.multidensity.R"))

Load saved data Read the sample information

samples <- read.csv("~/Git/UPSCb/projects/arabidopsis-porcupine-RNA-Seq/doc/RNA-seq_pcp_vs_Col-0_info.csv")

And extend it

samples$Genotype=sub(" .*","",samples$Description)
samples$Temp=ifelse(sub(".* ","",samples$Description)=="16",
                    ifelse(nchar(as.character(samples$Description))>16,"WC","C"),
                    ifelse(nchar(as.character(samples$Description))>16,"CW","W"))

Gene raw expression

cg <- read.csv("analysis/kallisto/combined-raw-unormalised-gene-expression_data.csv",
               row.names=1)

Transcript raw expression

ct <- read.csv("analysis/kallisto/combined-raw-unormalised-transcript-expression_data.csv",
               row.names=1)

Gene normalised expression

vst.g <- read.csv("analysis/kallisto/combined-library-size-normalized_variance-stabilized_gene-expression_data.csv",
               row.names=1)

Transcript raw expression

vst.t <- read.csv("analysis/kallisto/combined-library-size-normalized_variance-stabilized_transcript-expression_data.csv",
               row.names=1)

Setup graphics

pal=brewer.pal(8,"Dark2")
mar <- par("mar")

This is pcp

pcp <- "AT2G18740"

3 Function

".runDE" <- function(dds, annot, exp, alpha=0.01, lfc=0.5, outdir=NULL, type=c("gene","transcript")){
  # default
  type <- match.arg(type)
  if(type=="transcript"){
    display.columns <- c("TranscriptID", "SYMBOL","GENENAME")
    id.column <- "TranscriptID"
  } else {
    display.columns <- c("GeneID", "SYMBOL","GENENAME")
    id.column <- "GeneID"
  }
  
  # Differential Expression
  suppressMessages(dds <- DESeq(dds))
  
  # contrast
  ct <- gsub("condition","",paste(resultsNames(dds)[-1],collapse="-vs-"))
  if(is.null(outdir)){
    outdir <- ct    
  }
  
  # Dispersion Estimation
  plotDispEsts(dds)

  # Results  
  res <- results(dds,as.list(resultsNames(dds)[-1]))
  
  # Plot the Median vs Average
  plotMA(res,alpha,lfc)
  
  # Plot the log odds vs. log2 fold change
  volcanoPlot(res,alpha=alpha,lfc=lfc)
  
  # Plot the adjusted p-value histogram
  hist(res$padj,breaks=seq(0,1,.01))
  
  # Select genes below alpha and above lfc
  sel <- res$padj<alpha & !is.na(res$padj) & abs(res$log2FoldChange) >= lfc
  sprintf("There are %s genes differentially expressed at a %s FDR and a %s logFC cutoffs",
          sum(sel),alpha,lfc)
  
  # Write them out
  dir.create(outdir,showWarnings=FALSE,recursive=TRUE)
  write.csv(annot[sel,],
            file=file.path(outdir,
                           paste(ct,alpha,"FDR",lfc,
                                 "log2FC-cutoffs_significant-genes.csv",sep="-")))
  
  write.csv(annot[sel,][sign(res$log2FoldChange[sel])==1,],
            file=file.path(outdir,
                           paste(ct,alpha,"FDR",lfc,
                                 "log2FC-cutoffs_up-regulated_significant-genes.csv",sep="-")))
  
  write.csv(annot[sel,][sign(res$log2FoldChange[sel])==-1,],
            file=file.path(outdir,
                           paste(ct,alpha,"FDR",lfc,
                                 "log2FC-cutoffs_down-regulated_significant-genes.csv",sep="-")))
  
  write.csv(cbind(as.data.frame(res),annot),
            file=file.path(outdir,
                           paste0(ct,".csv")))
  
  # Feature name
  fname <- rownames(res)[sel]
  
  # Cluster the VST expression of the genes
  heatmap.2(as.matrix(exp[rownames(exp) %in% fname,colnames(exp) %in% dds$ID]),
            scale="row",labRow=NA,trace="none",
            RowSideColors=ifelse(sign(res[sel,"log2FoldChange"])==-1,
                                 "darkorange",
                                 "yellow"),
            labCol=colData(dds)$condition)
  
  # Interactive MDS and MA
  res.df <- as.data.frame(res)
  res.df$log10MeanNormCount <- log10(res.df$baseMean)
  idx <- rowSums(counts(dds)) > 0
  res.df <- res.df[idx,]
  res.df$padj[is.na(res.df$padj)] <- 1
  
  glMDPlot(res.df,
           xval = "log10MeanNormCount",
           yval = "log2FoldChange",
           counts = counts(dds)[idx,],
           anno = annot[idx,],
           groups = dds$condition,
           samples = paste(dds$condition,dds$replicate,sep="-"),
           status = sel[idx],
           display.columns = display.columns,
           id.column = id.column,
           path = outdir,
           folder = paste0(ct,"-report"),launch=FALSE)
  
  # Done
  return(fname)
}

4 Process

Re-order the samples

stopifnot(all(colnames(cg) == colnames(vst.g)))
stopifnot(all(colnames(ct) == colnames(vst.t)))
stopifnot(all(colnames(ct) == colnames(cg)))
colnames(ct) <- colnames(cg) <- colnames(vst.t) <- colnames(vst.g) <- sub("X","",colnames(cg))
samples <- samples[match(colnames(ct),samples$SampleID),]

4.1 Gather the annotation

df.annot <- data.frame(GeneID=rownames(cg),
                 SYMBOL=mapIds(org.At.tair.db,keys=rownames(cg),"SYMBOL","TAIR"),
                 GENENAME=mapIds(org.At.tair.db,keys=rownames(cg),"GENENAME","TAIR"))
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
tx.annot <- df.annot[match(sub("\\.[0-9]+","",rownames(vst.t)),df.annot$GeneID),]
row.names(tx.annot) <- rownames(vst.t)
tx.annot$TranscriptID <- rownames(vst.t)
tx.annot <- tx.annot[,c(4,1:3)]

4.2 DE at the gene level

outdir <- "analysis/DESeq2/Kallisto-gene/simple-contrasts/pcp"

4.2.1 pcp - C vs CW

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("C","CW")
C.vs.CW <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.2 pcp - C vs W

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("C","W")
C.vs.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.3 pcp - CW vs W

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("W","CW")
CW.vs.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.4 pcp - C vs WC

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("C","WC")
C.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.5 pcp - W vs WC

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("W","WC")
W.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.6 pcp - Check the overlap of the four sets

plot.new()
grid.draw(venn.diagram(list(
  C.vs.CW=C.vs.CW,
  C.vs.W=C.vs.W,
  CW.vs.W=CW.vs.W,
  C.vs.WC=C.vs.WC,
  W.vs.WC=W.vs.WC
    ),
  filename=NULL,
  col=pal[1:5],
  category.names=c("C.vs.CW","C.vs.W","CW.vs.W","C.vs.WC","W.vs.WC")))

4.2.7 Col0 - C vs CW

outdir <- "analysis/DESeq2/Kallisto-gene/simple-contrasts/Col-0"
sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("C","CW")
C.vs.CW <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.8 Col-0 - C vs W

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("C","W")
C.vs.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.9 Col-0 - CW vs W

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("W","CW")
CW.vs.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.10 Col-0 - C vs WC

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("C","WC")
C.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.11 Col-0 - W vs WC

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("W","WC")
W.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.12 Col-0 - Check the overlap of the four sets

plot.new()
grid.draw(venn.diagram(list(
  C.vs.CW=C.vs.CW,
  C.vs.W=C.vs.W,
  CW.vs.W=CW.vs.W,
  C.vs.WC=C.vs.WC,
  W.vs.WC=W.vs.WC
),
filename=NULL,
col=pal[1:5],
category.names=c("C.vs.CW","C.vs.W","CW.vs.W","C.vs.WC","W.vs.WC")))

4.2.13 Temperature

4.2.13.1 Col-0-C vs pcp-C

outdir <- "analysis/DESeq2/Kallisto-gene/simple-contrasts/Cold"
sel <- samples$Temp %in% "C"
Col0.vs.pcp.C <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Genotype[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.13.2 Col-0-W vs pcp-W

outdir <- "analysis/DESeq2/Kallisto-gene/simple-contrasts/Warm"
sel <- samples$Temp %in% "W"
Col0.vs.pcp.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = cg[,sel],
    colData = data.frame(condition=samples$Genotype[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  df.annot, vst.g, outdir=outdir)

4.2.13.3 Cold - Warm overlap

plot.new()
grid.draw(venn.diagram(list(
  Col0.vs.pcp.C=Col0.vs.pcp.C,
  Col0.vs.pcp.W=Col0.vs.pcp.W
),
filename=NULL,
col=pal[1:2],
category.names=c("Cold","Warm")))

4.3 DE at the transcript level

outdir <- "analysis/DESeq2/Kallisto-transcript/simple-contrasts/pcp"

4.3.1 pcp - C vs CW

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("C","CW")

C.vs.CW <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.2 pcp - C vs W

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("C","W")
C.vs.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.3 pcp - CW vs W

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("W","CW")
CW.vs.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.4 pcp - C vs WC

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("C","WC")
C.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.5 pcp - W vs WC

sel <- samples$Genotype=="pcp" & samples$Temp %in% c("W","WC")
W.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.6 pcp - Check the overlap of the four sets

plot.new()
grid.draw(venn.diagram(list(
  C.vs.CW=C.vs.CW,
  C.vs.W=C.vs.W,
  CW.vs.W=CW.vs.W,
  C.vs.WC=C.vs.WC,
  W.vs.WC=W.vs.WC
),
filename=NULL,
col=pal[1:5],
category.names=c("C.vs.CW","C.vs.W","CW.vs.W","C.vs.WC","W.vs.WC")))

outdir <- "analysis/DESeq2/Kallisto-transcript/simple-contrasts/Col-0"

4.3.7 Col-0 - C vs CW

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("C","CW")

C.vs.CW <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.8 Col-0 - C vs W

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("C","W")
C.vs.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.9 Col-0 - CW vs W

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("W","CW")
C.vs.CW <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.10 Col-0 - C vs WC

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("C","WC")
C.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.11 Col-0 - W vs WC

sel <- samples$Genotype=="Col-0" & samples$Temp %in% c("W","WC")
W.vs.WC <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Temp[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.12 Col-0 - Check the overlap of the four sets

plot.new()
grid.draw(venn.diagram(list(
  C.vs.CW=C.vs.CW,
  C.vs.W=C.vs.W,
  CW.vs.W=CW.vs.W,
  C.vs.WC=C.vs.WC,
  W.vs.WC=W.vs.WC
),
filename=NULL,
col=pal[1:5],
category.names=c("C.vs.CW","C.vs.W","CW.vs.W","C.vs.WC","W.vs.WC")))

4.3.13 Temperature

4.3.13.1 pcp-C vs Col-0-C

outdir <- "analysis/DESeq2/Kallisto-transcript/simple-contrasts/Cold"
sel <- samples$Temp %in% "C"
Col0.vs.pcp.C <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Genotype[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.13.2 pcp-W vs Col-0-W

outdir <- "analysis/DESeq2/Kallisto-transcript/simple-contrasts/Warm"
sel <- samples$Temp %in% "W"
Col0.vs.pcp.W <- .runDE(
  DESeqDataSetFromMatrix(
    countData = ct[,sel],
    colData = data.frame(condition=samples$Genotype[sel],
                         replicate=samples$Replicate[sel],
                         ID=samples$SampleID[sel]),
    design = ~condition),
  tx.annot, vst.t, outdir=outdir,type="transcript")

4.3.13.3 Cold - Warm overlap

plot.new()
grid.draw(venn.diagram(list(
  Col0.vs.pcp.C=Col0.vs.pcp.C,
  Col0.vs.pcp.W=Col0.vs.pcp.W
),
filename=NULL,
col=pal[1:2],
category.names=c("Cold","Warm")))

5 Session Info

## R version 3.3.2 (2016-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.1 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] VennDiagram_1.6.17         futile.logger_1.4.3       
##  [3] RColorBrewer_1.1-2         pander_0.6.0              
##  [5] org.At.tair.db_3.4.0       AnnotationDbi_1.36.0      
##  [7] LSD_3.0                    Glimma_1.2.1              
##  [9] gplots_3.0.1               DESeq2_1.14.1             
## [11] SummarizedExperiment_1.4.0 Biobase_2.34.0            
## [13] GenomicRanges_1.26.1       GenomeInfoDb_1.10.1       
## [15] IRanges_2.8.1              S4Vectors_0.12.1          
## [17] BiocGenerics_0.20.0       
## 
## loaded via a namespace (and not attached):
##  [1] base64_2.0           Rcpp_0.12.8          locfit_1.5-9.1      
##  [4] lattice_0.20-34      gtools_3.5.0         assertthat_0.1      
##  [7] rprojroot_1.1        digest_0.6.11        plyr_1.8.4          
## [10] futile.options_1.0.0 backports_1.0.4      acepack_1.4.1       
## [13] RSQLite_1.1-1        evaluate_0.10        ggplot2_2.2.1       
## [16] zlibbioc_1.20.0      lazyeval_0.2.0       data.table_1.10.0   
## [19] annotate_1.52.0      gdata_2.17.0         rpart_4.1-10        
## [22] Matrix_1.2-7.1       rmarkdown_1.2        splines_3.3.2       
## [25] BiocParallel_1.8.1   geneplotter_1.52.0   stringr_1.1.0       
## [28] foreign_0.8-67       RCurl_1.95-4.8       munsell_0.4.3       
## [31] htmltools_0.3.5      nnet_7.3-12          openssl_0.9.5       
## [34] tibble_1.2           gridExtra_2.2.1      htmlTable_1.7       
## [37] edgeR_3.16.5         Hmisc_4.0-1          XML_3.98-1.5        
## [40] bitops_1.0-6         xtable_1.8-2         gtable_0.2.0        
## [43] DBI_0.5-1            magrittr_1.5         scales_0.4.1        
## [46] KernSmooth_2.23-15   stringi_1.1.2        XVector_0.14.0      
## [49] genefilter_1.56.0    limma_3.30.7         latticeExtra_0.6-28 
## [52] Formula_1.2-1        lambda.r_1.1.9       tools_3.3.2         
## [55] survival_2.40-1      yaml_2.1.14          colorspace_1.3-2    
## [58] cluster_2.0.5        caTools_1.17.1       memoise_1.0.0       
## [61] knitr_1.15.1